// GLSL-style type aliases so shader-like code can be written with OpenCL vector types.
#define vec2 float2
#define vec3 float3
#define vec4 float4
// NOTE: these rewrite the swizzle tokens `rgb` / `rgba` into reversed x/z order
// (`.rgb` becomes `.zyx`, `.rgba` becomes `.zyxw`) wherever they appear after this point.
#define rgb zyx
#define rgba zyxw
// Scalar max/min helpers. The whole expansion is parenthesised so the conditional
// cannot be re-associated by surrounding operators (e.g. `2 * _max(a, b)`).
// Arguments are still evaluated more than once: do not pass expressions with side effects.
#define _max(a,b) ((a)>(b)?(a):(b))
#define _min(a,b) ((a)<(b)?(a):(b))
// Component-wise absolute value of a 4-component vector, built as a float4.
// The argument is evaluated several times; it must expose .x/.y/.z/.w.
#define _abs(a)	((float4)( (a).x>0.0f?(a).x:-(a).x, (a).y>0.0f?(a).y:-(a).y, (a).z>0.0f?(a).z:-(a).z, (a).w>0.0f?(a).w:-(a).w))
// Shared sampler used by every image read in this file:
// normalized coordinates, clamp-to-edge addressing, linear filtering.
const sampler_t sampler =
	CLK_NORMALIZED_COORDS_TRUE |
	CLK_ADDRESS_CLAMP_TO_EDGE |
	CLK_FILTER_LINEAR;

// Samples src_data at coordinate tc after remapping tc through param->origROI:
// tc is scaled by (origROI[2], origROI[3]) and offset by (origROI[0], origROI[1]).
// The fetched texel is returned with its x and z components swapped (.zyxw).
// NOTE(review): origROI presumably holds (x, y, width, height) of a region of
// interest in normalized units -- confirm against the FilterParam definition.
vec4 INPUT1(image2d_t src_data,  __global FilterParam* param, vec2 tc)
{
	const vec2 roi_scale  = (vec2)(param->origROI[2], param->origROI[3]);
	const vec2 roi_offset = (vec2)(param->origROI[0], param->origROI[1]);

	tc = tc * roi_scale + roi_offset;

	const vec4 texel = read_imagef(src_data, sampler, tc);
	return texel.zyxw;
}

// Samples the overlay image at tc with the shared sampler and returns the texel
// with its x and z components swapped (.zyxw). No ROI remapping is applied here;
// param is accepted for signature symmetry with INPUT1 and is not read.
vec4 INPUT2(image2d_t ovelay1,  __global FilterParam* param, vec2 tc)
{
	const vec4 texel = read_imagef(ovelay1, sampler, tc);
	return texel.zyxw;
}


/*
 * Per-pixel colour filter.
 *
 * For each work-item the kernel:
 *   1. samples the source image through INPUT1,
 *   2. computes three small per-channel corrections (cmy1, cmy2, cmy3) from the
 *      complement of the sampled colour and its max/mid/min components,
 *   3. adds the corrections to the colour and clamps the result to [0, 1],
 *   4. remaps each component through the overlay image, used as a lookup table
 *      sampled at three fixed rows (v = 0.16666, 0.5, 0.83333),
 *   5. blends the original and filtered colour by alpha/100 and writes the pixel.
 *
 * src_data   source image.
 * ovelay1    lookup image read by INPUT2.
 * dest_data  output image; one pixel is written at (get_global_id(0), get_global_id(1)).
 * param      filter parameters; passed to INPUT1/INPUT2 and to the
 *            get_global_id0/get_global_id1 helpers (defined outside this block).
 * alpha      blend percentage, expected in 0..100 (0 = original, 100 = filtered).
 *
 * The sampling coordinate is normalised by the global work size, so the launch
 * is expected to use one work-item per output pixel.
 */
__kernel void MAIN(
      __read_only image2d_t src_data,
	  __read_only image2d_t ovelay1,
      __write_only image2d_t dest_data,        //Data in global memory
	  __global FilterParam* param,
	  int alpha)  //0-100		// the gpu items/threads should be newW*newH
{
	const int2 coordinate = (int2)(get_global_id(0), get_global_id(1));

	// write_imagef is undefined for coordinates outside the image, so work-items
	// from a padded/rounded-up global range must not write.
	if (coordinate.x >= get_image_width(dest_data) ||
	    coordinate.y >= get_image_height(dest_data))
	{
		return;
	}

	const int W = get_global_size(0);
	const int H = get_global_size(1);

	// Pixel-centre position normalised by the global work size.
	const vec2 gl_FragCoord = (vec2)(get_global_id0( param), get_global_id1( param));
	const vec2 tc = (gl_FragCoord + (vec2)(0.5f) )/ (float2)(W,H);

	// Sampled colour; INPUT1 has already swapped x and z relative to read_imagef.
	// NOTE(review): the name says BGRA; the actual channel meaning depends on the
	// host-side image format -- confirm with the caller.
	const vec4 inBGRA = INPUT1(src_data, param, tc);
	vec4 color = inBGRA;

	const vec3 white = (vec3)(1.0f, 1.0f, 1.0f);
	const vec3 cmy = white - inBGRA.xyz;	// complement of the sampled colour

	vec3 cmy1 = (vec3)(0.0f);
	vec3 cmy2 = (vec3)(0.0f);
	vec3 cmy3 = (vec3)(0.0f);
	float lim = 0.0f;

	// Largest, smallest and median of the three colour components.
	const float maxV = max(max(inBGRA.x, inBGRA.y), inBGRA.z);
	const float minV = min(min(inBGRA.x, inBGRA.y), inBGRA.z);
	const float midV = max(min(inBGRA.x, inBGRA.y), min(max(inBGRA.x, inBGRA.y), inBGRA.z));

	// Correction applied when the x component is the largest one.
	// Exact float comparison is safe: maxV is one of the three inputs verbatim.
	if (maxV == inBGRA.x)
	{
		lim = maxV - midV;
		cmy1.x = min(cmy.x, 0.38f) * lim;
		cmy1.y = - min(cmy.y, 0.1f) * lim;
		cmy1.z = min(cmy.z, 0.2f) * lim;
	}

	// Correction applied when the z component is the smallest one.
	// NOTE(review): here and in cmy3 below the .z term reads cmy.x, whereas cmy1.z
	// reads cmy.z. Left unchanged; confirm whether cmy.z was intended.
	if (minV == inBGRA.z)
	{
		lim = midV - minV;
		cmy2.x = min(cmy.x, 0.28f) * lim;
		cmy2.y = -min(cmy.y, 0.05f) * lim;
		cmy2.z = -min(cmy.x, 0.25f) * lim;
	}

	// Dark / bright corrections. The two conditions cannot both hold because
	// minV <= maxV, so they are expressed as an if / else-if pair.
	if (maxV < 0.5f)
	{
		lim = 1.0f - maxV - minV;
		cmy3.x = min(cmy.x, 0.06f) * lim;
		cmy3.y = -min(cmy.y, 0.07f) * lim;
		cmy3.z = min(cmy.x, 0.09f) * lim;
	}
	else if (minV > 0.5f)
	{
		lim = maxV + minV - 1.0f;
		cmy3.x = -min(cmy.x, 0.02f) * lim;
		cmy3.y = min(cmy.y, 0.03f) * lim;
		cmy3.z = min(cmy.x, 0.03f) * lim;
	}

	// The file-level `rgb` macro expands to `zyx`; the swizzle is written out
	// explicitly here so the component order is visible at the point of use.
	color.zyx = clamp(inBGRA.zyx + cmy1 + cmy2 + cmy3, 0.0f, 1.0f);

	// Remap each component through the overlay: the component value is the
	// horizontal coordinate, and each component uses its own fixed row.
	color.x = INPUT2(ovelay1,  param,  (vec2)(color.x, 0.16666f)).x;
	color.y = INPUT2(ovelay1,  param,  (vec2)(color.y, 0.5f)).y;
	color.z = INPUT2(ovelay1,  param,  (vec2)(color.z, 0.83333f)).z;

	// The fourth component is passed through from the source unchanged.
	color.w = inBGRA.w;

	// Blend original and filtered colour; .zyxw undoes the swap done by INPUT1.
	const float blend = alpha / 100.0f;
	write_imagef(dest_data, coordinate, inBGRA.zyxw * (1.0f - blend) + color.zyxw * blend);
}
